################### PROJECT- JCR REPLICATION - data_processing_02 ##############################################
# This R file combines the raw datasets and creates the month-level dataset for battle-related violence 
# Datasets generated from this codes include dataset2_oldschool.csv and dataset2_staggeredadoption.csv.
# They includes all variables needed for main analysis for battle-related violence and corresponding tests in appendices
# Last updated: 16th, May 2025, by Wangyin Zhao
##############################################################################################################

##############################################################################################################
# Outline of Script
# A. Preparation
#    - Set working directory
#    - Load required packages
#
# B. Import Raw Datasets
#
# C. Process Each Variable
#    1. Barangay-level battle-related event data
#    2. The data on rainfall-related natural disasters   
#       2.1 The data on rainfall-related natural disasters from EM-DAT
#.      2.2 The data on rainfall concentration
#    3. The data on the territorial setting
#       3.1 The data on the spatial lag of the territorial setting
#    4. Night Light as a control (for Appendix E)
#         
# D. Merge data for the old school DID
#   
# E. Create the second dataset for staggered adoption version
##############################################################################################################

#### A. Preparation ####
# Set working directory to Replication_Files folder
setwd("./Replication_Files")

# Import relevant packages
packages <- c("sf", "tidyverse", "plm", "exactextractr", "spdep", "zoo")
lapply(packages, library, character.only = TRUE)
########


#### B. Import raw datasets: ####
# The spatial data on the barangays from the Philippine Standard Geographic Code (PSGC) 
barangay <- st_read("./data/raw/Barangays/Barangays.shp") 

# Barangay-level battle-related event data:
conflict <- readRDS ("./data/raw/bcms_incidentlevel.rds")

# Barangay-level territorial control data:
control <- st_read("./data/raw/CNN_Brgy_Affectation_2011-2014.csv")


# The data on rainfall-related natural disasters:
# the precipitation data is from the IMERG: Integrated Multi-satellite Retrievals for GPM. 
# .nc or tif. files for the precipitation for each month and grid can be downloaded from https://gpm.nasa.gov/data/imerg (better for batch downloading) 
# https://giovanni.gsfc.nasa.gov/giovanni/#service=TmAvMp&starttime=&endtime=&dataKeyword=IMERG%20Final (for a more access version)

# the data on the nightlight 
# the nightlight data is derived from the the VIIRS Day/Night Band (DNB) onboard the Joint Polar-orbiting Satellite System (JPSS) satellites 
# tif. files for the nightlight can be downloaded from https://eogdata.mines.edu/products/vnl/#daily

########

#### C. Process each variable: ####

#### 1. barangay-level battle-related event data: #####
conflict_f <- conflict %>%
  filter(year > 2010 & year < 2015,
         who.npa == 1,
         who.afp == 1 | who.pnp == 1) %>%
  # choose the conflict between the Philippines government(PNP or AFP) and NPA
  mutate(month = as.integer(substr(date, 6, 7))) %>%
  drop_na(psgc.bgy) %>%
  group_by(psgc.bgy, year, month) %>%
  summarise(npa_count = n(), .groups = "drop") %>%
  mutate(code = paste(psgc.bgy, year, sep = "_")) %>%
  mutate(code = paste(code, month, sep = "_")) %>%
  # create the code for data merging 
  dplyr::select(code, npa_count)

########

#### 2. The data on rainfall-related natural disasters: ####
# As the sizes are too big, I have not included the raw data for all precipitation tif. files for each grid at each month
# the precipitation data is from the IMERG: Integrated Multi-satellite Retrievals for GPM. 
# .nc or tif. files for the precipitation for each month and grid can be downloaded from https://gpm.nasa.gov/data/imerg (better for batch downloading)
# https://giovanni.gsfc.nasa.gov/giovanni/#service=TmAvMp&starttime=&endtime=&dataKeyword=IMERG%20Final (for a more access version)

# I include the data on precipitation mean at the barangay and month level after processing and the codes for the processing all spatial files (.tif) 

# Processing codes: 

# rainfallfilelist <- list.files("data/phl/", full.names = T, pattern = "GIOVANNI-*") 
# rainfallfilesdf <- data.frame(filename=rainfallfilelist, yearmonth=stringr::str_split(basename(rainfallfilelist), pattern = "_") %>% 
#                    lapply(., function(x) x[4]) %>% unlist()) %>%
#                    mutate(yearmonth = str_sub(yearmonth,start=15, end=20))

# fulldf <- tibble()
# for(i in rainfallfilesdf$yearmonth){

# tt <- Sys.time()
# print(i)

# rainfallfilename <- rainfallfilesdf$filename[rainfallfilesdf$yearmonth==i] # name of corresponding NLT file at t-1

# rainfallfile <- terra::rast(rainfallfilename) # read NLT raster

# rainfallrast <- terra::crop(rainfallfile, phl) # crop nlt raster to gfo raster
# rainfallrast <- terra::mask(rainfallrast, phl)

# tmpres <- tibble(rainfallfile=rainfallfilename,
#                  ADM4_PCODE=phl$ADM4_PCODE,
#                  yearmonth=i,
#                  rainfallmean=exact_extract(rainfallrast, phl, "mean"))

# fulldf <- bind_rows(fulldf,tmpres)
# print(Sys.time()-tt)}

# Below codes are about coding the occurence of natural disasters at the barangay-year level step by step
# First, upload the processing data for the precipitation mean at the barangay-month level and calculate the z-value
fulldf <- read.csv("./data/analysis/rainfall_zonal stat for phl.csv")  %>%
  mutate(yearmonth = as.numeric(yearmonth)) %>%
  dplyr::select(ADM4_PCODE, yearmonth, rainfallmean) %>%
  mutate(
    year = as.integer(str_sub(yearmonth, 1, 4)),
    month = as.integer(str_sub(yearmonth, 5, 6))) # some data cleaning


output_f <- fulldf %>%
  group_by(ADM4_PCODE, year, month) %>%
  summarise(rainfallmean = mean(rainfallmean, na.rm = TRUE), .groups = "drop") %>%
  arrange(ADM4_PCODE, year, month) %>%
  group_by(ADM4_PCODE, month) %>%
  mutate(
    roll = rollmean(rainfallmean, k = 11, fill = NA, align = "center"),
    roll_sd = rollapply(rainfallmean, width = 11, FUN = sd, fill = NA, align = "center")
  ) %>%
  ungroup() %>%
  filter(year > 2008, year < 2015) %>%
  mutate(z_value = (rainfallmean - roll) / roll_sd) %>%
  drop_na() # this code creates the z_value for the precipitation mean for each barangay at each month 

nd_battle <- output_f %>% 
  mutate(z_value = as.numeric(z_value),
         ND_1.5_bar = if_else(z_value >= 1.5, 1, 0),
         ND_1.6_bar = if_else(z_value >= 1.6, 1, 0),
         ND_1.7_bar = if_else(z_value >= 1.7, 1, 0),
         ND_1.8_bar = if_else(z_value >= 1.8, 1, 0),
         ND_1.9_bar = if_else(z_value >= 1.9, 1, 0),
         ND_2_bar = if_else(z_value >= 2, 1, 0)) %>%
  rename(z_value_bar = `z_value`) %>%
  na.omit()

davao <- st_read("data/analysis/davao_region.csv") #this data includes barangays in the Davao region

nd_battle <- nd_battle %>% filter(ADM4_PCODE %in% davao$ADM4_PCODE) %>%
  filter(year > 2010 & year < 2015) %>% mutate(ADM4_PCODE = str_replace(ADM4_PCODE, "^PH0*", "")) %>%
  mutate(code = paste(ADM4_PCODE, year, sep = "_")) %>%
  mutate(code = paste(code, month, sep = "_")) 

########

### 2.1 The data on rainfall-related natural disasters from EM-DAT ####
# I manually coded the occurrence of natural disasters based on location column from the EM-DAT
em_dat <- read_csv("data/analysis/EM-DAT Measure.csv") %>%
  mutate(month = as.numeric(month),
         nd_em_dat = 1,
         ADM4_PCODE = str_replace(ADM4_PCODE, "^PH0*", ""))

########

#### 2.2 The data on rainfall concentration: ####
# To calculate monthly level rainfall concentrations, 
# I use the precipitation data at the day and grid level derived from the IMERG: Integrated Multi-satellite Retrievals for GPM.
# As the sizes are too big, I have not included the raws data for all precipitation .nc files for each grid at each day

# .nc files for the precipitation for each day and grid can be downloaded from https://gpm.nasa.gov/data/imerg (better for batch downloading)

# First, I downloaded all .nc4 (NETCDF) files for every grid-day;
# Second, I conducted the conversion and clipping of a batch of .nc4 (NetCDF) files to .tif (GeoTIFF) format for spatial analysis, focusing on the Philippines.
# The second step is conducted using Bash scripts in the macOS Terminal with GDAL tools - the Command-line Bash scripts can be found in the replication file - code
# Third, I used R to loop through clipped all GeoTIFFs, extract the mean precipitation within each barangay polygon using "exactextractr", 
# and compile the results into a CSV file for zonal statistical analysis.

# below is the coding example

# input_dir <- "WHERE YOU STORE THE CONCERTED .TIF"  # Directory with raster files
# vector_file <- barangay  # Path to your vector file (barangy) 

# phl <- st_read(vector_file) # Load vector zones (as sf object for compatibility with exactextractr)

# raster_files <- list.files(input_dir, pattern = "\\.tif$", full.names = TRUE) ## List all raster files in the input directory

# zonal_results <- tibble() # Initialize an empty tibble to store results

# for (raster_file in raster_files) {
#   rainfallfilename <- basename(raster_file)
#   date_match <- gsub(".*\\.(\\d{4})(\\d{2})(\\d{2})-.*", "\\1-\\2-\\3", rainfallfilename) # Extract date (year, month, and day) from the filename
#   yearmonthday <- as.Date(date_match, format = "%Y-%m-%d")
  
#  rainfallrast <- rast(raster_file)
  
#  rainfallmean <- exact_extract(rainfallrast, phl, "mean") ## Calculate mean rainfall using exact_extract

#  tmpres <- tibble(
#    rainfallfile = rainfallfilename,
#    ADM4_PCODE = phl$ADM4_PCODE,
#    yearmonthday = yearmonthday,
#    rainfallsd = rainfallmean)
  
#   zonal_results <- bind_rows(zonal_results, tmpres)}

# Fourth, I calculate the monthly level rainfall concentration 
# Below are the codes example 
# zonal_results <- zonal_results %>%
#  mutate(
#    rainfallmean = as.numeric(rainfallmean),
#    year = str_sub(yearmonthday, start = 1, end = 4),
#    month = str_sub(yearmonthday, start = 6, end = 7),
#    day = str_sub(yearmonthday, start = 9, end = 10)) %>%
#  group_by(ADM4_PCODE, year, month) %>%
#  summarise(total_rainfall = sum(rainfallmean, na.rm = TRUE),
#            rci = ifelse(total_rainfall > 0,
#                 sum((rainfallmean^2), na.rm = TRUE) / (total_rainfall^2), NA))

# Finally, I created the dataset for the monthly rainfall concentration for each barangay and month 
# I rename it as "rainfall_zonal_stats_2011_2014_rainfall_concentraction" 

rainfallconcentraction <- st_read("./data/analysis/rainfall_zonal_stats_2011_2014_rainfall_concentraction.csv") %>%
  mutate(month = as.integer(month), 
         rci = as.numeric(rci)) %>%
  drop_na() %>% mutate(ADM4_PCODE = str_replace(ADM4_PCODE, "^PH0*", ""),
                       year = as.numeric(year))
########

#### 3 The data on the territorial setting: ####

# First, filter territorial control data for target regions. 
# I include both Davao region and its neighboring regions to code the territorial setting
phlT <- control %>%
  filter(Region_PSGC_Name %in% c(
    "REGION XI (DAVAO REGION)",
    "REGION XIII (Caraga)",
    "REGION X (NORTHERN MINDANAO)",
    "REGION XII (SOCCSKSARGEN)"))

# Second, filter barangays by target region codes and prepare spatial data
phl <- barangay %>%
  filter(ADM1_PCODE %in% c("PH100000000", "PH110000000", "PH120000000", "PH160000000")) %>%
  st_make_valid() %>%
  mutate(ID = substr(ADM4_PCODE, 3, 11))

# Third, merge control data with spatial geometry
phlT_1 <- merge(phl, phlT, by.x = "ID", by.y = "Brgy_PSGC_ID", all.y = TRUE) %>%
  dplyr::select(
    ADM1_PCODE, ADM2_PCODE, ADM3_PCODE, ADM4_PCODE, ADM4_EN, RuralUrban, year,
    Infl, LessInfl, Threat, aff, Infl2, aff2, Infl3) %>%
  pivot_wider(names_from = year,
    values_from = c(Infl, LessInfl, Threat, aff, Infl2, aff2, Infl3)) %>%
  mutate(across(starts_with("Infl2_"), as.numeric))

# Calculate spatial lag
phlT_1 <- st_make_valid(phlT_1)
phl_nb <- poly2nb(phlT_1, queen = TRUE)
phl_nblist <- nb2listw(phl_nb, style = "W", zero.policy = TRUE)

# 2011
phlT_1$T_inf2_2011 <- lag.listw(x = phl_nblist, var = phlT_1$Infl2_2011, zero.policy = TRUE, NAOK = TRUE) 
phlT_1$T_inf2_2011[phlT_1$T_inf2_2011 > 0 & phlT_1$T_inf2_2011 < 1] <- "Mixed"
phlT_1$T_inf2_2011[phlT_1$T_inf2_2011 == 0 & phlT_1$Infl2_2011 == 0] <- "Homogenous"
phlT_1$T_inf2_2011[phlT_1$T_inf2_2011 == 1 & phlT_1$Infl2_2011 == 1] <- "Homogenous"
phlT_1$T_inf2_2011[phlT_1$T_inf2_2011 == 0 & phlT_1$Infl2_2011 == 1] <- "Enclave"
phlT_1$T_inf2_2011[phlT_1$T_inf2_2011 == 1 & phlT_1$Infl2_2011 == 0] <- "Enclave"

# 2012
phlT_1$T_inf2_2012 <- lag.listw(x = phl_nblist, var = phlT_1$Infl2_2012, zero.policy = TRUE, NAOK = TRUE) 
phlT_1$T_inf2_2012[phlT_1$T_inf2_2012 > 0 & phlT_1$T_inf2_2012 < 1] <- "Mixed"
phlT_1$T_inf2_2012[phlT_1$T_inf2_2012 == 0 & phlT_1$Infl2_2012 == 0] <- "Homogenous"
phlT_1$T_inf2_2012[phlT_1$T_inf2_2012 == 1 & phlT_1$Infl2_2012 == 1] <- "Homogenous"
phlT_1$T_inf2_2012[phlT_1$T_inf2_2012 == 0 & phlT_1$Infl2_2012 == 1] <- "Enclave"
phlT_1$T_inf2_2012[phlT_1$T_inf2_2012 == 1 & phlT_1$Infl2_2012 == 0] <- "Enclave"

# 2013
phlT_1$T_inf2_2013 <- lag.listw(x = phl_nblist, var = phlT_1$Infl2_2013, zero.policy = TRUE, NAOK = TRUE) 
phlT_1$T_inf2_2013[phlT_1$T_inf2_2013 > 0 & phlT_1$T_inf2_2013 < 1] <- "Mixed"
phlT_1$T_inf2_2013[phlT_1$T_inf2_2013 == 0 & phlT_1$Infl2_2013 == 0] <- "Homogenous"
phlT_1$T_inf2_2013[phlT_1$T_inf2_2013 == 1 & phlT_1$Infl2_2013 == 1] <- "Homogenous"
phlT_1$T_inf2_2013[phlT_1$T_inf2_2013 == 0 & phlT_1$Infl2_2013 == 1] <- "Enclave"
phlT_1$T_inf2_2013[phlT_1$T_inf2_2013 == 1 & phlT_1$Infl2_2013 == 0] <- "Enclave"

# 2014
phlT_1$T_inf2_2014 <- lag.listw(x = phl_nblist, var = phlT_1$Infl2_2014, zero.policy = TRUE, NAOK = TRUE) 
phlT_1$T_inf2_2014[phlT_1$T_inf2_2014 > 0 & phlT_1$T_inf2_2014 < 1] <- "Mixed"
phlT_1$T_inf2_2014[phlT_1$T_inf2_2014 == 0 & phlT_1$Infl2_2014 == 0] <- "Homogenous"
phlT_1$T_inf2_2014[phlT_1$T_inf2_2014 == 1 & phlT_1$Infl2_2014 == 1] <- "Homogenous"
phlT_1$T_inf2_2014[phlT_1$T_inf2_2014 == 0 & phlT_1$Infl2_2014 == 1] <- "Enclave"
phlT_1$T_inf2_2014[phlT_1$T_inf2_2014 == 1 & phlT_1$Infl2_2014 == 0] <- "Enclave"

phl_infl2 <- phlT_1 %>% dplyr::select(ADM4_PCODE, Infl2_2011, Infl2_2012, Infl2_2013, Infl2_2014) %>%
  pivot_longer(cols = starts_with("Infl2"), 
               names_to = "year", 
               values_to = "Infl2") %>%
  mutate(year = str_sub(year,start=-4),
         ID_year = paste(ADM4_PCODE, year, sep = "_"),
         year = as.numeric(year))


phl_tinf2 <- phlT_1 %>% dplyr::select(ADM4_PCODE, T_inf2_2011, T_inf2_2012, T_inf2_2013, T_inf2_2014) %>%
  pivot_longer(cols = starts_with("T_inf2_"), 
               names_to = "year", 
               values_to = "T_inf2") %>%
  mutate(year = str_sub(year,start=-4),
         ID_year = paste(ADM4_PCODE, year, sep = "_"),
         year = as.numeric(year)) %>% st_drop_geometry()


phl_f <- phl_infl2 %>% 
  left_join(phl_tinf2) 


phl_f$Enclave_inf2[phl_f$T_inf2 == "Enclave"] <- 1
phl_f$Enclave_inf2[phl_f$T_inf2 == "Homogenous" | phl_f$T_inf2 == "Mixed"] <- 0
phl_f$Mixed_inf2[phl_f$T_inf2 == "Mixed"] <- 1
phl_f$Mixed_inf2[phl_f$T_inf2 == "Homogenous" | phl_f$T_inf2 == "Enclave"] <- 0


########

#### 3.1 The data on the spatial lag of the territorial setting: ####
# Here it bases on the dataset - phl_f generated from 2
year <- c(2011, 2012, 2013, 2014)

# Enclave 
phldf1 <- tibble()
for(i in year){
  tt <- Sys.time()
  print(i)
  phl_each_year <- phl_f %>% filter(year==i)
  phl_f_nb <- spdep::poly2nb(phl_each_year, queen = TRUE)
  phl_f_nblist <- nb2listw(phl_f_nb,style = "W", zero.policy = TRUE) 
  
  tm <- tibble(ADM4_PCODE=phl_each_year$ADM4_PCODE,
               
               time= i,
               
               Enclave_inf2_nb=lag.listw(x = phl_f_nblist, var = phl_each_year$Enclave_inf2, zero.policy = TRUE, NAOK = TRUE))
  
  tt <- Sys.time()
  print(i)
  
  phldf1 <- bind_rows(phldf1,tm)
  
  print(Sys.time()-tt)
}

phldf1$Enclave_inf2_nb_d[phldf1$Enclave_inf2_nb > 0] <- 1 
phldf1$Enclave_inf2_nb_d[phldf1$Enclave_inf2_nb == 0] <- 0


# Mixed setting
phldf2 <- tibble()
for(i in year){
  tt <- Sys.time()
  print(i)
  phl_each_year <- phl_f %>% filter(year==i)
  phl_f_nb <- spdep::poly2nb(phl_each_year, queen = TRUE)
  phl_f_nblist <- nb2listw(phl_f_nb,style = "W", zero.policy = TRUE) 
  
  tm <- tibble(ADM4_PCODE=phl_each_year$ADM4_PCODE,
               
               time= i,
               
               Mixed_inf2_nb=lag.listw(x = phl_f_nblist, var = phl_each_year$Mixed_inf2, zero.policy = TRUE, NAOK = TRUE))
  
  tt <- Sys.time()
  print(i)
  
  phldf2 <- bind_rows(phldf2,tm)
  
  print(Sys.time()-tt)
}

phldf2$Mixed_inf2_nb_d[phldf2$Mixed_inf2_nb > 0] <- 1 
phldf2$Mixed_inf2_nb_d[phldf2$Mixed_inf2_nb == 0] <- 0


phldf1 <- phldf1 %>% mutate(ID_year = paste(ADM4_PCODE, time, sep = "_")) 
phldf2 <- phldf2 %>% mutate(ID_year = paste(ADM4_PCODE, time, sep = "_"))

phldf_f <- phl_f %>% mutate(ID_year = paste(ADM4_PCODE, year, sep = "_")) %>%
  left_join(phldf1) %>%
  left_join(phldf2) %>% 
  dplyr::select(ADM4_PCODE, year, ID_year, Infl2, Enclave_inf2, Mixed_inf2, Enclave_inf2_nb_d, Mixed_inf2_nb_d) %>%
  mutate(code = str_sub(ID_year, start=3, end=16)) %>% st_drop_geometry()

########

#### 4. Night Light as a control (for Appendix E): ####
# the nightlight data is derived from the the VIIRS Day/Night Band (DNB) onboard the Joint Polar-orbiting Satellite System (JPSS) satellites 
# tif. files for the nightlight can be downloaded from https://eogdata.mines.edu/products/vnl/#daily

# below are codes for processing. After downloading the annual level nightlight data, you can replace n2012, n2013, and n2014 with your downloaded ones
# nl$n2012 <- exact_extract(n2012, barangay, "mean")
# nl$n2013 <- exact_extract(n2013, barangay, "mean")
# nl$n2014 <- exact_extract(n2014, barangay, "mean")

# nl_long <- nl %>%
#  select(-geometry) %>%  # Remove geometry if present
#  pivot_longer(cols = starts_with("n20"), names_to = "year", values_to = "value") %>%
#  mutate(year = gsub("n", "", year))  # Remove 'n' to keep only the year

# nl_long <- nl_long %>% dplyr::select(ADM4_PCODE, year, value) %>% 
#  rename(NightLight = value) %>% 
#  st_drop_geometry() 

# the data after processing is uplaoded here as nl_long
nl_long <- st_read("./data/analysis/nightlight.csv") %>% mutate(NightLight = as.numeric(NightLight),
                                                                year = as.numeric(year)) %>%
  mutate(ADM4_PCODE = str_replace(ADM4_PCODE, "^PH0*", "")) 
  

########

#### D. Merge data ####
# First, I would merge each dataset above, including conflict_f, nd_battle, phldf_f
dataset <- merge(conflict_f, nd_battle, by.x = "code", by.y = "code", all.y = TRUE) %>% 
  mutate(code = paste(ADM4_PCODE, year, sep = "_")) %>% 
  left_join(phldf_f, by = "code") %>% 
  rename("ADM4_PCODE" = "ADM4_PCODE.x") %>% 
  rename("year" = "year.x") %>% 
  mutate(year = as.numeric(year), month = as.numeric(month)) %>% 
  left_join(em_dat) %>% 
  mutate(nd_em_dat = if_else(is.na(nd_em_dat), 0, 1)) %>% 
  left_join(rainfallconcentraction) %>% left_join(nl_long) %>%
  mutate(time = paste(year, month, sep = "_")) %>% distinct() 
  
dataset$npa_count[is.na(dataset$npa_count)] <- 0
dataset_f <- dataset %>% mutate(npa = if_else(npa_count > 0, 1, 0)) 

# Second, I code the lead version of the battle-related violence 
dataset_f <- dataset_f %>% arrange(ADM4_PCODE, year, month) %>%
  group_by(ADM4_PCODE) %>%
  mutate(npa_1 = dplyr::lead(npa, n=1)) %>%
  mutate(npa_2 = dplyr::lead(npa, n=2)) %>%
  mutate(npa_3 = dplyr::lead(npa, n=3)) %>%
  mutate(npa_4 = dplyr::lead(npa, n=4)) %>%
  mutate(npa_5 = dplyr::lead(npa, n=5)) %>%
  mutate(npa_6 = dplyr::lead(npa, n=6)) %>%
  mutate(npa_7 = dplyr::lead(npa, n=7)) %>%
  mutate(npa_8 = dplyr::lead(npa, n=8)) %>%
  mutate(npa_9 = dplyr::lead(npa, n=9)) %>%
  mutate(npa_10 = dplyr::lead(npa, n=10)) %>%
  mutate(npa_11 = dplyr::lead(npa, n=11)) %>%
  mutate(npa_12 = dplyr::lead(npa, n=12))

# Third, code the lag version of the territorial setting 
dataset_f <- dataset_f %>% arrange(ADM4_PCODE,month) %>% group_by(ADM4_PCODE, month) %>%
  mutate(Infl2 = dplyr::lag(Infl2, n=1))%>%
  mutate(Enclave_inf2 = dplyr::lag(Enclave_inf2, n=1)) %>%
  mutate(Mixed_inf2 = dplyr::lag(Mixed_inf2, n=1)) %>%
  mutate(Enclave_inf2_nb_d = dplyr::lag(Enclave_inf2_nb_d, n=1)) %>%
  mutate(Mixed_inf2_nb_d = dplyr::lag(Mixed_inf2_nb_d, n=1))
  
# Fourth, code the final outcome 
dataset_f$npa_after1[dataset_f$npa_1 == 1 | dataset_f$npa_2 == 1 | dataset_f$npa_3 == 1 | dataset_f$npa_4 == 1 | dataset_f$npa_5 == 1 | dataset_f$npa_6 == 1 | dataset_f$npa_7 == 1 | dataset_f$npa_8 == 1 | dataset_f$npa_9 == 1 | dataset_f$npa_10 == 1 | dataset_f$npa_11 == 1 | dataset_f$npa_12 == 1] <- 1

dataset_f$npa_after1[is.na(dataset_f$npa_after1)] <- 0

dataset_f$npa_after2[dataset_f$npa_2 == 1 | dataset_f$npa_3 == 1 | dataset_f$npa_4 == 1 | dataset_f$npa_5 == 1 | dataset_f$npa_6 == 1 | dataset_f$npa_7 == 1 | dataset_f$npa_8 == 1 | dataset_f$npa_9 == 1 | dataset_f$npa_10 == 1 | dataset_f$npa_11 == 1 | dataset_f$npa_12 == 1] <- 1

dataset_f$npa_after2[is.na(dataset_f$npa_after2)] <- 0

dataset_f$npa_after3[dataset_f$npa_3 == 1 | dataset_f$npa_4 == 1 | dataset_f$npa_5 == 1 | dataset_f$npa_6 == 1 | dataset_f$npa_7 == 1 | dataset_f$npa_8 == 1 | dataset_f$npa_9 == 1 | dataset_f$npa_10 == 1 | dataset_f$npa_11 == 1 | dataset_f$npa_12 == 1] <- 1

dataset_f$npa_after3[is.na(dataset_f$npa_after3)] <- 0

dataset_f$npa_after4[dataset_f$npa_4 == 1 | dataset_f$npa_5 == 1 | dataset_f$npa_6 == 1 | dataset_f$npa_7 == 1 | dataset_f$npa_8 == 1 | dataset_f$npa_9 == 1 | dataset_f$npa_10 == 1 | dataset_f$npa_11 == 1 | dataset_f$npa_12 == 1] <- 1

dataset_f$npa_after4[is.na(dataset_f$npa_after4)] <- 0

dataset_f$npa_after5[dataset_f$npa_6 == 1 | dataset_f$npa_7 == 1 | dataset_f$npa_8 == 1 | dataset_f$npa_9 == 1 | dataset_f$npa_10 == 1 | dataset_f$npa_11 == 1 | dataset_f$npa_12 == 1] <- 1

dataset_f$npa_after5[is.na(dataset_f$npa_after5)] <- 0

dataset_f$npa_after6[dataset_f$npa_7 == 1 | dataset_f$npa_8 == 1 | dataset_f$npa_9 == 1 | dataset_f$npa_10 == 1 | dataset_f$npa_11 == 1 | dataset_f$npa_12 == 1] <- 1

dataset_f$npa_after6[is.na(dataset_f$npa_after6)] <- 0


# Fifth, add the lat and lon for Conley standard error in Appendix I
latlon <- barangay %>% dplyr::select(ADM4_PCODE, LATITUDE, LONGITUDE) %>% st_make_valid()
latlon <- latlon %>% filter(ADM4_PCODE != "PH097332") %>% 
  mutate(ADM4_PCODE = str_replace(ADM4_PCODE, "^PH0*", "")) %>%
  st_drop_geometry()

dataset_f <- latlon %>% mutate(ADM4_PCODE = str_replace(ADM4_PCODE, "^PH0*", "")) %>% right_join(dataset_f) 

dataset_f <- dataset_f %>% dplyr::select(ADM4_PCODE, LATITUDE, LONGITUDE, year, month, time, ID_year, code, npa_count,
                                         npa, npa_after1, npa_after2, npa_after3, npa_after4, npa_after5, npa_after6,
                                         ND_1.5_bar, ND_1.6_bar, ND_1.7_bar, ND_1.8_bar, ND_1.9_bar, ND_2_bar, nd_em_dat, z_value_bar,
                                         Infl2, Enclave_inf2, Mixed_inf2, Enclave_inf2_nb_d, Mixed_inf2_nb_d,
                                         rci, NightLight)
                                         

write_csv(dataset_f, "./data/analysis/dataset2_oldschool.csv", na = "NA") 
# the is the first month-level dataset for DID old school analysis, and can be found in the file - replication_files/data/analysis;
# it looks the battle-related violence as the outcome
########

#### E. Create the second dataset for staggered adoption version ####
# Make the staggered adoption 
# First, Convert 'time' to a proper Date object 
dataset2_f <- dataset_f
dataset2_f <- dataset2_f %>%
  mutate(year = as.numeric(year),
         month = as.numeric(month),
         numeric_time = (year - 2011) * 12 + month)

# Second, code the first hit 
dataset2_f <- dataset2_f %>%
  group_by(ADM4_PCODE) %>%
  mutate(
    first_disaster_year_month = min(numeric_time[ND_1.6_bar == 1], na.rm = TRUE),  # First disaster
    treated = ifelse(numeric_time >= first_disaster_year_month, 1, 0)  # Treated after first disaster
  ) %>%
  ungroup()


# Third, identify second disaster time
dataset2_f <- dataset2_f %>%
  group_by(ADM4_PCODE) %>%
  mutate(
    second_disaster_time = nth(numeric_time[ND_1.6_bar == 1], 2, default = NA)  # Second disaster time
  ) %>%
  ungroup()

#### Calculate the spatial lag for the treat for the robustness test (Appendix C) ####
## make the treated.spatial lag
map <- barangay %>% st_make_valid() %>% 
  dplyr::select(ADM4_PCODE) %>% filter(ADM4_PCODE != "PH097332") %>% mutate(ADM4_PCODE = str_replace(ADM4_PCODE, "^PH0*", ""))

fulldf_1 <-  map %>% right_join(dataset2_f) #HERE THERE ARE SOMETHING WRONG WITH PH097332, WHICH CORRESPONDS TO THREE ROWS

yearmonth <- c("2011_1", "2011_2", "2011_3", "2011_4", "2011_5", "2011_6", "2011_7", "2011_8", "2011_9", "2011_10", "2011_11", "2011_12",
               "2012_1", "2012_2", "2012_3", "2012_4", "2012_5", "2012_6", "2012_7", "2012_8", "2012_9", "2012_10", "2012_11", "2012_12",
               "2013_1", "2013_2", "2013_3", "2013_4", "2013_5", "2013_6", "2013_7", "2013_8", "2013_9", "2013_10", "2013_11", "2013_12",
               "2014_1", "2014_2", "2014_3", "2014_4", "2014_5", "2014_6", "2014_7", "2014_8", "2014_9", "2014_10", "2014_11", "2014_12")

#1.6
df_treated <- tibble()
for(i in yearmonth){
  tt <- Sys.time()
  print(i)
  fulldf_1_each_time <- fulldf_1 %>% filter(time == i )
  fulldf_1_nb <- spdep::poly2nb(fulldf_1_each_time, queen = TRUE)
  fulldf_1_nblist <- nb2listw(fulldf_1_nb,style = "W", zero.policy = TRUE) 
  
  tmpres <- tibble(ADM4_PCODE=fulldf_1_each_time$ADM4_PCODE,
                   
                   time= i,
                   
                   ND_nb=lag.listw(x = fulldf_1_nblist, var = fulldf_1_each_time$treated, zero.policy = TRUE, NAOK = TRUE))
  
  tt <- Sys.time()
  print(i)
  
  df_treated <- bind_rows(df_treated,tmpres)
  
  print(Sys.time()-tt)
}


df_treated$ND_nb_d[df_treated$ND_nb > 0] <- 1
df_treated$ND_nb_d[df_treated$ND_nb == 0] <- 0
df_treated <- df_treated %>% rename(NDtreated_nb = ND_nb) %>% 
  rename(NDtreated_nb_d = ND_nb_d)
df_treated_1 <- df_treated[df_treated$ADM4_PCODE != "PH097332",]

df_f <- fulldf_1 %>% 
  left_join(df_treated_1) 

df_f <- df_f %>% filter(year > 2011 & year < 2015) 


dataset2_f <- df_f

# Fourth, Exclude months after the second disaster
dataset2_filtered <- dataset2_f %>%
  filter(is.na(second_disaster_time) | numeric_time <= second_disaster_time)

# Fifth, I exclude the disaster-affected barangays in 2010 and 2011. 
test <- output_f %>% filter(year == 2010 | year == 2011) %>% 
  filter(z_value >= 1.6) %>% 
  # output_f is the data on the precipitation deviation genereated in 2)
  group_by(ADM4_PCODE) %>% 
  summarise(n=n()) # calculate the affected barangays in 2010 and 2011

test1_b <- test %>% mutate(ADM4_PCODE = str_replace(ADM4_PCODE, "^PH0*", ""))

dataset2_filtered <- dataset2_filtered %>%
  filter(!ADM4_PCODE %in% test1_b$ADM4_PCODE) %>% st_drop_geometry()

dataset2_filtered <- dataset2_filtered %>% dplyr::select(ADM4_PCODE, LATITUDE, LONGITUDE, year, month, time, ID_year, code,
                                                         npa, npa_after1, npa_after2, npa_after3, npa_after4, npa_after5, npa_after6,
                                                         treated, NDtreated_nb_d,
                                                         Infl2, Enclave_inf2, Mixed_inf2, Enclave_inf2_nb_d, Mixed_inf2_nb_d, rci, NightLight)

write_csv(dataset2_filtered, "./data/analysis/dataset2_staggeredadoption.csv", na = "NA") 
# the is the second month-level dataset for staggered adoption analysis, and can be found in the file - replication_files/data/analysis;
# it looks the battle-related violence as the outcome
########

